In [1]:
import requests
from bs4 import BeautifulSoup
from mechanize import Browser
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
In [2]:
# url = 'https://www.tripadvisor.com/Attraction_Review-g60878-d3184389-Reviews-Chihuly_Garden_and_Glass-Seattle_Washington.html#REVIEWS'
def get_reviews(response):
    """Extract review text from a TripAdvisor attraction page."""
    # response can be an HTML string or a file-like response object
    soup = BeautifulSoup(response, 'html.parser')
    entries = soup.find_all('div', {'class': 'entry'})
    reviews = [entry.text.replace('\n', '') for entry in entries]
    return reviews
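As a quick sanity check, get_reviews can be run on a single attraction page fetched with requests. This is just a sketch: the URL is the Chihuly Garden example from the comment above, and the browser-like User-Agent header is an assumption (TripAdvisor may reject the default requests agent).

In [ ]:
url = 'https://www.tripadvisor.com/Attraction_Review-g60878-d3184389-Reviews-Chihuly_Garden_and_Glass-Seattle_Washington.html#REVIEWS'
# assumed: a browser-like User-Agent header, since the default requests agent may be blocked
response = requests.get(url, headers={'User-agent': 'Firefox'})
reviews = get_reviews(response.text)
print(len(reviews))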
In [3]:
def mechanize_reviews(url):
    br = Browser()  # initialize browser object
    br.set_handle_robots(False)  # avoids a 'disallowed by robots.txt' error
    # br.addheaders = [('User-agent', 'Firefox')]  # sometimes you need this line
    br.open(url)  # retrieve the requested page
    reviews = []
    # follow each attraction link on the page until one yields enough reviews
    for link in br.links():
        if 'Attraction_Review' in str(link):
            data = br.follow_link(link)  # file-like response, accepted by BeautifulSoup
            reviews = get_reviews(data)
            if len(reviews) > 10:
                return reviews
    return reviews
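mechanize_reviews expects a page that links out to individual Attraction_Review pages, such as a city's "Things to Do" listing (which is what the Selenium loop below navigates to). A minimal direct call might look like the sketch that follows; the Seattle listing URL is an assumption, not taken from this notebook.

In [ ]:
# assumed URL for Seattle's 'Things to Do' listing page
seattle_things_to_do = 'https://www.tripadvisor.com/Attractions-g60878-Activities-Seattle_Washington.html'
reviews = mechanize_reviews(seattle_things_to_do)
print(len(reviews))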
In [5]:
url = 'https://www.tripadvisor.com'
places = ['Portland, OR', 'San Francisco, CA', 'Seattle, WA']
chromedriver = '/Users/sydneydecoto/bin/chromedriver'
for place in places:
    # initialize a Chrome driver and go to the TripAdvisor home page
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    # wait for the page to load; time out after 10 seconds
    searchbox = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'GEO_SCOPED_SEARCH_INPUT')))
    searchbox.send_keys(place)
    mainsearch = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'mainSearch')))
    mainsearch.send_keys('Things to Do')
    # click the first search suggestion
    driver.find_elements_by_class_name('inner')[0].click()
    driver.switch_to_alert()  # ignore the popup if one appears
    reviews = mechanize_reviews(driver.current_url)
    # print(reviews)
    driver.quit()
In [17]:
br = Browser() # Initialize browser object
br.set_handle_robots(False) # try this if you get a 'disallowed by robots.txt' error
br.addheaders = [('User-agent', 'Firefox')] # sometimes you need this line
url = 'https://seattle.craigslist.org/'
br.open(url)
for form in br.forms():
    print(form)
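Once the form of interest is identified from the printout above, it can be selected and submitted. The sketch below assumes the search form is the first form on the page (nr=0) and that its text input is named 'query'; both are assumptions to verify against the printed forms.

In [ ]:
br.select_form(nr=0)   # assumed: the search form is the first form on the page
br['query'] = 'couch'  # 'query' is an assumed field name; check the print output above
response = br.submit()
soup = BeautifulSoup(response, 'html.parser')
print(soup.title)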
In [ ]: